import scanpy as sc, pandas as pd, numpy as np
from utils.plotting import plot_obs_barchart, plot_obs_treemap
# Location of the query AnnData on disk: <folder>/<name>/<name>.h5ad
ANNDATA_FOLDER = 'datasets'
QUERY_DATASET_NAME = 'LCA'
ABS_FILE_PATH = f'{ANNDATA_FOLDER}/{QUERY_DATASET_NAME}/{QUERY_DATASET_NAME}.h5ad'
# "LCA.h5ad" already carries annotations produced by the authors via a
# human-in-the-loop strategy.
query_adata = sc.read_h5ad(ABS_FILE_PATH)
# Summarize the categorical .obs columns (skipping any column with more than
# 50 distinct values) first as a bar chart, then as a treemap.
for summary_plot in (plot_obs_barchart, plot_obs_treemap):
    fig = summary_plot(query_adata, max_unique=50, dataset_name=QUERY_DATASET_NAME)
    fig.show()
C:\ProgramData\Anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
# query_adata.obs['cell_ontology_type'].unique().tolist()
# Trying to figure out: how was the LCA subsampled query data created?
# print(
# len(query_adata.obs['cell_ontology_type'].unique().tolist()),
# len(smartseq_adata.obs['cell_type'].unique().tolist()),
# len(tenx_adata.obs['cell_type'].unique().tolist()),
# len(set(smartseq_adata.obs['cell_type'].unique().tolist() + tenx_adata.obs['cell_type'].unique().tolist()))
# )
# cell_compartment_lookup_df = pd.concat([smartseq_adata.obs[['compartment','cell_type']], tenx_adata.obs[['compartment','cell_type']]]).drop_duplicates()
# cell_compartment_lookup_df.sort_values(by=['cell_type'])
from utils.utiltity_functions import run_external_script

# CellTypist run: invoke the prediction pipeline script as a subprocess.
PYTHON_CMD = 'python'
SCRIPT_NAME = 'celltypist/celltypist_prediction_pipeline.py'
MOUNT_GOOGLE_DRIVE = 'False'
EXISTING_ANNOTATIONS_COLUMN = ''  # no pre-existing annotation column is forwarded
OUTPUT_PREDICTIONS_FILE = 'celltypist_preds.csv'
CELLTYPIST_MODEL_NAME = 'Human_Lung_Atlas.pkl'
# Assemble the CLI flags as one space-separated string — the argument format
# run_external_script expects. Each value is double-quoted.
_cli_flags = [
    ('--mount-google-drive', MOUNT_GOOGLE_DRIVE),
    ('--existing-annotations-column', EXISTING_ANNOTATIONS_COLUMN),
    ('--folder-name', ANNDATA_FOLDER),
    ('--dataset-name', QUERY_DATASET_NAME),
    ('--output-predictions-file', OUTPUT_PREDICTIONS_FILE),
    ('--model-name', CELLTYPIST_MODEL_NAME),
]
args = ' '.join(f'{flag} "{value}"' for flag, value in _cli_flags)
run_external_script(PYTHON_CMD, SCRIPT_NAME, args)
Python OUTPUT:
2023041719
Python ERROR:
usage: celltypist_prediction_pipeline.py [-h]
[--mount-google-drive MOUNT_GOOGLE_DRIVE]
[--existing-annotations-column [EXISTING_ANNOTATIONS_COLUMN]]
[--folder-name [FOLDER_NAME]]
[--dataset-name [DATASET_NAME]]
[--output-predictions-file [OUTPUT_PREDICTIONS_FILE]]
[--model-name [MODEL_NAME]]
celltypist_prediction_pipeline.py: error: unrecognized arguments: --mount-google-drive "False" --existing-annotations-column "" --folder-name "datasets" --dataset-name "LCA" --output-predictions-file "celltypist_preds_temp.csv" --model-name "Human_Lung_Atlas.pkl"
If the script fails due to usage issues, try the following command directly in a terminal:
Python celltypist/celltypist_prediction_pipeline.py --mount-google-drive "False" --existing-annotations-column "" --folder-name "datasets" --dataset-name "LCA" --output-predictions-file "celltypist_preds_temp.csv" --model-name "Human_Lung_Atlas.pkl"
# RSCRIPT_CMD = '/N/soft/rhel7/r/4.1.1/lib64/R/bin/Rscript'
# BUGFIX: the R scripting front-end is spelled 'Rscript'. 'RScript' only
# resolved on case-insensitive filesystems (Windows); on Linux/macOS it fails
# with command-not-found. The fallback message this notebook prints also
# spells it 'Rscript'.
RSCRIPT_CMD = 'Rscript'
SCRIPT_NAME = 'azimuth/azimuth_prediction_pipeline.R'
OUTPUT_PREDICTIONS_FILE = 'azimuth_preds.tsv'
'''
Choose one of Azimuth's references:
"adiposeref", "bonemarrowref", "fetusref", "heartref",
"humancortexref", "kidneyref", "lungref", "mousecortexref",
"pancreasref", "pbmcref", "tonsilref"
'''
REFERENCE = 'lungref'
# Positional arguments, consumed in this exact order by the R pipeline script:
# <folder> <dataset> <output-file> <azimuth-reference>
args = f'{ANNDATA_FOLDER} {QUERY_DATASET_NAME} {OUTPUT_PREDICTIONS_FILE} {REFERENCE}'
print(RSCRIPT_CMD, SCRIPT_NAME, args)
run_external_script(RSCRIPT_CMD, SCRIPT_NAME, args)
RScript azimuth/azimuth_prediction_pipeline.R datasets LCA azimuth_preds_temp.tsv lungref Rscript OUTPUT: [1] "loaded Seurat" [1] "loaded Azimuth" [1] "loaded SeuratData" [1] "loaded patchwork" [1] "loaded logr" [1] "azimuth_preds_2023421719.log" [1] "logs/log/azimuth_preds_2023421719.log" [1] "2023-04-17 19:42:00 : datasets" [2] "2023-04-17 19:42:00 : LCA" [3] "2023-04-17 19:42:00 : azimuth_preds_temp.tsv" [4] "2023-04-17 19:42:00 : lungref" [1] "2023-04-17 19:42:00 : Initializing arguments" [1] "2023-04-17 19:42:00 : C:/Users/HP/Desktop/Vikrant/Github_Repositories/ct-ann-predictive-analytics" [1] "2023-04-17 19:42:00 : Loading the query dataset : datasets/LCA/LCA.h5ad" [1] "**** Loading the query dataset : datasets/LCA/LCA.h5ad" [1] "2023-04-17 19:42:18 : Loaded the query dataset" [1] "2023-04-17 19:42:18 : Running Azimuth using the reference [lungref] dataset." [1] "Error in dir.exists(reference) : invalid filename argument\nCalls: RunAzimuth -> RunAzimuth.Seurat -> dir.exists\n" [1] "Traceback:" [1] "3: dir.exists(reference)" [2] "2: RunAzimuth.Seurat(query_adata, reference = REFERENCE)" [3] "1: RunAzimuth(query_adata, reference = REFERENCE)" Rscript ERROR: Attaching SeuratObject Warning message: package 'Seurat' was built under R version 4.2.2 Registered S3 method overwritten by 'SeuratDisk': method from as.sparse.H5Group Seurat Attaching shinyBS ── Installed datasets ───────────────────────────────────── SeuratData v0.2.2 ── ✔ adiposeref 1.0.0 ✔ kidneyref 1.0.1 ✔ bonemarrowref 1.0.0 ✔ lungref 2.0.0 ✔ heartref 1.0.0 ────────────────────────────────────── Key ───────────────────────────────────── ✔ Dataset loaded successfully ❯ Dataset built with a newer version of Seurat than installed ❓ Unknown version of Seurat installed Warning message: package 'patchwork' was built under R version 4.2.2 Warning message: package 'logr' was built under R version 4.2.2 Error in dir.exists(reference) : invalid filename argument Calls: RunAzimuth -> RunAzimuth.Seurat -> 
dir.exists Execution halted If the script fails due to usage issues, try the following command directly in a terminal: Rscript azimuth/azimuth_prediction_pipeline.R datasets LCA azimuth_preds_temp.tsv lungref
Modularizing the PopV Tutorial is tricky. Keeping this aside for now.
# For Lung datasets, run the PopV tutorial ipynb file for now.